Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import plotly
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline
import warnings
warnings.filterwarnings('ignore')
px.set_mapbox_access_token(open(".mapbox_token").read())
cnf, dth, rec, act = '#ff2e63', '#a9a9a9', '#21bf73', '#fe9801'

Importing and formatting the data

In [2]:
whole = pd.read_csv("covid_19_clean_complete.csv", parse_dates=['Date'])
In [3]:
whole.head()
Out[3]:
Province/State Country/Region Lat Long Date Confirmed Deaths Recovered
0 NaN Afghanistan 33.0000 65.0000 2020-01-22 0 0 0
1 NaN Albania 41.1533 20.1683 2020-01-22 0 0 0
2 NaN Algeria 28.0339 1.6596 2020-01-22 0 0 0
3 NaN Andorra 42.5063 1.5218 2020-01-22 0 0 0
4 NaN Angola -11.2027 17.8739 2020-01-22 0 0 0
In [4]:
whole.columns = ['State', 'Country', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered']
In [5]:
whole.isnull().sum()
Out[5]:
State        13172
Country          0
Lat              0
Long             0
Date             0
Confirmed        0
Deaths           0
Recovered        0
dtype: int64
In [6]:
whole.shape
Out[6]:
(19018, 8)
In [7]:
whole.State = whole.State.fillna('')
whole['Active'] = whole.Confirmed - whole.Deaths - whole.Recovered
In [8]:
whole[['Confirmed', 'Deaths', 'Recovered', 'Active']] = whole[['Confirmed', 'Deaths', 'Recovered', 'Active']].fillna(0)
In [9]:
whole.dtypes
Out[9]:
State                object
Country              object
Lat                 float64
Long                float64
Date         datetime64[ns]
Confirmed             int64
Deaths                int64
Recovered             int64
Active                int64
dtype: object

World wide overview of cases

In [10]:
num_cases = whole.groupby('Date')['Recovered', 'Deaths','Active'].sum().reset_index()
num_cases = num_cases.melt(id_vars="Date", value_vars=['Recovered', 'Deaths','Active'],
                 var_name='Case', value_name='Count')
In [11]:
fig = px.area(num_cases,x='Date',y='Count',color='Case', width=600, height=600,
              labels={'Count':'No. of cases'} ,title='No. of cases Vs Time', color_discrete_sequence = [rec, dth, act])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()
In [12]:
num_cases_shift = whole.groupby('Date')['Confirmed'].sum().reset_index().shift(1)
num_cases = whole.groupby('Date')['Confirmed'].sum().reset_index()
num_cases = whole.groupby('Date')['Confirmed'].sum().reset_index()
num_cases.Confirmed = (num_cases-num_cases_shift)['Confirmed'].fillna(0)
In [13]:
fig = px.bar(num_cases,x='Date',y='Confirmed', width=600, height=600, 
             labels={'Confirmed':'No. of confirmed cases'}, title='Increase in no. of cases on daily basis',
             color_discrete_sequence = [cnf])
fig.show()
In [14]:
temp = whole[whole['Active']!=0]
num_country_shift = temp.groupby('Date').agg({'Country': 'count'}).reset_index().shift(1)
num_country = temp.groupby('Date').agg({'Country': 'count'}).reset_index()
num_country_plt = num_country.copy()
num_country_plt.Country = (num_country_plt-num_country_shift)['Country'].fillna(0)
num_country_plt.head()
Out[14]:
Date Country
0 2020-01-22 0.0
1 2020-01-23 7.0
2 2020-01-24 3.0
3 2020-01-25 3.0
4 2020-01-26 3.0
In [15]:
fig = px.bar(num_country_plt,x='Date',y='Country', width=600, height=600,
            labels={'Country':'No. of countries'}, 
            title='Increase/decrease in no. of countries <br>with covid-19 cases on daily basis',
            color_discrete_sequence = [dth])
fig.show()
In [16]:
fig = px.bar(num_country,x='Date',y='Country', width=600, height=600,
            labels={'Country':'No. of countries'}, 
            title='No. of countries with covid-19 cases Vs Time',
            color_discrete_sequence = [dth])
fig.show()

Top 10 countries ranked

In [17]:
top_10 = whole.groupby('Country')['Confirmed'].sum().reset_index()
top_10 = top_10.sort_values('Confirmed', ascending=False).iloc[:10,:]

labels = top_10.Country
fig = make_subplots(rows=1, cols=1, specs=[[{'type':'domain'}]])
fig.add_trace(go.Pie(labels=labels, values=top_10.Confirmed),
              1, 1)
fig.update_traces(hole=.4, hoverinfo="label+percent")

fig.update_layout(
    title_text="Top 10 countries with most Covid-19 cases",
    annotations=[dict(text='Covid-19 cases<br>distribution', x=0.5, y=0.5, font_size=20, showarrow=False)],
    autosize=False,
    width=700,
    height=700)
fig.show()
In [18]:
top_10_rec=whole[whole.Date=='2020-04-04'].groupby('Country').Recovered.sum().reset_index().sort_values('Recovered',ascending=False).iloc[:10]
top_10_act=whole[whole.Date=='2020-04-04'].groupby('Country').Active.sum().reset_index().sort_values('Active',ascending=False).iloc[:10]
top_10_dead=whole[whole.Date=='2020-04-04'].groupby('Country').Deaths.sum().reset_index().sort_values('Deaths',ascending=False).iloc[:10]
top_10_con=whole[whole.Date=='2020-04-04'].groupby('Country').Confirmed.sum().reset_index().sort_values('Confirmed',ascending=False).iloc[:10]
In [19]:
fig_rec = px.bar(top_10_rec,x='Country',y='Recovered', width=600, height=600, 
            title='No. of countries with most recoveries',
            color_discrete_sequence = [rec],text='Recovered')
fig_rec.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig_act = px.bar(top_10_act,x='Country',y='Active', width=600, height=600, 
            title='No. of countries with most active cases',
            color_discrete_sequence = [act],text='Active')
fig_act.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig_dead = px.bar(top_10_dead,x='Country',y='Deaths', width=600, height=600, 
            title='No. of countries with most deaths',
            color_discrete_sequence = [dth],text='Deaths')
fig_dead.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig_con = px.bar(top_10_con,x='Country',y='Confirmed', width=600, height=600, 
            title='No. of countries with most confirmed cases',
            color_discrete_sequence = [cnf],text='Confirmed')
fig_con.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig = make_subplots(rows=2, cols=2, shared_xaxes=False, horizontal_spacing=0.14, vertical_spacing=0.14,
                    subplot_titles=('Recovered', 'Active cases','Deaths reported','Confirmed cases'))

fig.add_trace(fig_rec['data'][0], row=1, col=1)
fig.add_trace(fig_act['data'][0], row=1, col=2)
fig.add_trace(fig_dead['data'][0], row=2, col=1)
fig.add_trace(fig_con['data'][0], row=2, col=2)

fig.update_layout(height=1000)

No. of days to reach n cases

In [20]:
def first_n(df,n):
    try:
        init_date = df.Date.iloc[0]
        mini = df[df.Confirmed >=n]
        fin_date = mini.Date.iloc[0]
        dt_time = fin_date-init_date
        return init_date,fin_date,dt_time.days,mini.Confirmed.iloc[0]
    except IndexError:
        return 0,0,0,0
In [21]:
hun_cnf = whole.groupby('Country').apply(first_n,100)
hun_cnf = pd.DataFrame({'Country':hun_cnf.index,'Val':hun_cnf.values})
hun_cnf[['init_date', 'fin_date','days','Confirmed']] = pd.DataFrame(hun_cnf['Val'].tolist(), index=hun_cnf.index)
hun_cnf = hun_cnf.drop('Val',axis=1)[hun_cnf.days!=0].sort_values('days')
fig = px.scatter(hun_cnf,x='Country',y='days',color='Confirmed',title='No. of days to reach 100 or more cases')
fig.show()
In [22]:
fhun_cnf = whole.groupby('Country').apply(first_n,500)
fhun_cnf = pd.DataFrame({'Country':fhun_cnf.index,'Val':fhun_cnf.values})
fhun_cnf[['init_date', 'fin_date','days','Confirmed']] = pd.DataFrame(fhun_cnf['Val'].tolist(), index=fhun_cnf.index)
fhun_cnf = fhun_cnf.drop('Val',axis=1)[fhun_cnf.days!=0].sort_values('days')
fhun_cnf.sample(4)
fig = px.scatter(fhun_cnf,x='Country',y='days',color='Confirmed',title='No. of days to reach 500 or more cases')
fig.show()
In [23]:
th_cnf = whole.groupby('Country').apply(first_n,1000)
th_cnf = pd.DataFrame({'Country':th_cnf.index,'Val':th_cnf.values})
th_cnf[['init_date', 'fin_date','days','Confirmed']] = pd.DataFrame(th_cnf['Val'].tolist(), index=th_cnf.index)
th_cnf = th_cnf.drop('Val',axis=1)[th_cnf.days!=0].sort_values('days')
th_cnf.sample(4)
fig = px.scatter(th_cnf,x='Country',y='days',color='Confirmed',title='No. of days to reach 1000 or more cases')
fig.show()

line graph for countries vs no. of cases

In [24]:
a=whole.groupby(['Country','Date'])['Recovered'].sum().reset_index()  
fig_rec=px.line(a,x='Date',y='Recovered',color='Country',title='Recovered cases',height=600)
fig_rec.show()
a=whole.groupby(['Country','Date'])['Active'].sum().reset_index()  
fig_act=px.line(a,x='Date',y='Active',color='Country',title='Active cases',height=600)
fig_act.show()
a=whole.groupby(['Country','Date'])['Deaths'].sum().reset_index()  
fig_dead=px.line(a,x='Date',y='Deaths',color='Country',title='Deaths cases',height=600)
fig_dead.show()
a=whole.groupby(['Country','Date'])['Confirmed'].sum().reset_index()  
fig_con=px.line(a,x='Date',y='Confirmed',color='Country',title='Confirmed cases', height=600)
fig_con.show()

Biggest one day jumps

In [25]:
def single_day_apply(df,feat):
    df = df.groupby('Date')[feat].sum()
    return df.rolling(window=2).apply(lambda x:x[1]-x[0]).max() 
def one_day_jump(df,feat,col):
    one_day_df = df.groupby('Country').apply(single_day_apply,feat)
    one_day_df = pd.DataFrame({'Country':one_day_df.index,'Count':one_day_df.values}).sort_values('Count',ascending=False)
    return one_day_df.head(20).style.set_caption('Top 20 one day jump in '+feat+' cases').hide_index().bar(subset=["Count"], color=col)
In [26]:
one_day_jump(whole.copy(),'Confirmed',cnf)
Out[26]:
Top 20 one day jump in Confirmed cases
Country Count
US 33264
France 25646
China 13628
Spain 9630
Germany 6933
Italy 6557
United Kingdom 4516
Iran 3186
Turkey 3013
Belgium 1850
Canada 1724
Austria 1321
Switzerland 1321
Brazil 1304
Netherlands 1179
Israel 1131
Portugal 1035
South Korea 851
Russia 771
Sweden 621
In [27]:
one_day_jump(whole.copy(),'Deaths',dth)
Out[27]:
Top 20 one day jump in Deaths cases
Country Count
France 1355
US 1320
Spain 961
Italy 919
United Kingdom 709
China 252
Belgium 192
Germany 187
Netherlands 175
Iran 158
Brazil 86
Turkey 79
Switzerland 75
Sweden 69
Canada 40
Portugal 37
Philippines 29
Algeria 28
Ecuador 27
Romania 23
In [28]:
one_day_jump(whole.copy(),'Recovered',rec)
Out[28]:
Top 20 one day jump in Recovered cases
Country Count
US 4945
Germany 4289
Spain 4096
China 3995
France 2194
Iran 1801
Italy 1632
Switzerland 1569
South Korea 1369
Denmark 894
Mexico 598
Austria 485
Belgium 436
Luxembourg 420
Peru 377
Turkey 302
Finland 290
Diamond Princess 285
Netherlands 247
Thailand 163
In [29]:
one_day_jump(whole.copy(),'Active',act)
Out[29]:
Top 20 one day jump in Active cases
Country Count
US 30266
France 23155
China 12241
Spain 7218
Germany 5873
Italy 5359
United Kingdom 3820
Turkey 2648
Iran 2391
Canada 1694
Belgium 1581
Switzerland 1299
Brazil 1218
Austria 1199
Israel 1124
Netherlands 1079
Portugal 1015
South Korea 851
Russia 720
India 553

Geo plotting the data

In [30]:
temp = whole[whole['Date'] == max(whole['Date'])]
latest_country = temp.groupby('Country')['Recovered','Confirmed','Deaths'].sum().reset_index()
fig = px.scatter_geo(latest_country, locations="Country",locationmode='country names',
                     size=np.log(latest_country.Confirmed),
                     color='Confirmed',
                     title='Confirmed cases around the globe',
                     projection="natural earth",hover_name='Country',color_continuous_scale=px.colors.sequential.Sunsetdark)
fig.show()
In [31]:
by_country = whole.groupby(['Date','Country'])['Recovered','Deaths','Confirmed','Active'].sum().reset_index()
fig = px.choropleth(by_country, locations="Country", locationmode='country names', 
                    color=np.log(by_country["Confirmed"]), 
                    hover_name="Country", animation_frame=by_country["Date"].dt.strftime('%d-%m-%Y'),
                    title='Animation of increase/decrease of cases on daily basis', 
                    color_continuous_scale=px.colors.sequential.Mint)
fig.update(layout_coloraxis_showscale=False)
fig.show()
In [32]:
loc=temp[temp.Country=='China']
fig = px.scatter_mapbox(loc, lat=loc.Lat,lon=loc.Long,
                     size=np.log(loc.Confirmed),
                     color='Confirmed', title = 'Distribution of confirmed cases in China',
                     hover_name='State',color_continuous_scale=px.colors.sequential.Sunsetdark)
fig.show()
In [33]:
loc=temp[temp.Country=='Australia']
fig = px.scatter_mapbox(loc, lat=loc.Lat,lon=loc.Long,
                     size=np.log(loc.Confirmed),title = 'Distribution of confirmed cases in Australia',
                     color='Confirmed',
                     hover_name='State',color_continuous_scale=px.colors.sequential.Sunsetdark)
fig.show()

Hypothesis : Spread of virus is directly proportional to the population

Lets test the hypothesis with Australia

In [34]:
raw = {'State':['New South Wales','Victoria','Queensland','Western Australia','South Australia','Tasmania','Australian Capital Territory','Northern Territory'],
       'Population':[7317500,5640900,4599400,2366900,1659800,511000,366900,231200]}
aus_pop = pd.DataFrame(raw)
aus_pop = aus_pop.merge(loc,on='State',how='left')
In [35]:
fig = px.parallel_coordinates(aus_pop[['Population','Confirmed']], width=600,height=600)
fig.show()

7 out of 8 states show direct proportionality of virus spread with population of the region, hence hypothesis is true for this case only.